In [25]:
!pip3 install pyarrow
Defaulting to user installation because normal site-packages is not writeable Collecting pyarrow Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB) Requirement already satisfied: numpy>=1.16.6 in /nexus/posix0/MAGE-flaski/service/posit/home/wangy/.jupyter/python/3.10/lib/python3.10/site-packages (from pyarrow) (1.24.3) Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.9/39.9 MB 48.7 MB/s eta 0:00:00:00:0100:01 Installing collected packages: pyarrow Successfully installed pyarrow-17.0.0 [notice] A new release of pip is available: 24.1.1 -> 24.2 [notice] To update, run: pip install --upgrade pip
In [2]:
import wget
import pandas as pd
import numpy as np
import gzip
import shutil
import os
from pyarrow.parquet import ParquetFile
import pyarrow as pa
In [16]:
# url = 'https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_STARv2.5.3a_junctions.gct.gz'
# output_directory = "/nexus/posix0/MAGE-flaski/service/projects/data/Adam_Antebi/AA_spliceosomics"
# filename = wget.download(url, out=output_directory)
# filename
# filenameunzip = filename.replace('.gz', '')
# with gzip.open(filename, 'rb') as f_in:
#     with open(filenameunzip, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)
In [5]:
# Directory on the shared project volume that holds the GTEx downloads.
output_directory = "/nexus/posix0/MAGE-flaski/service/projects/data/Adam_Antebi/AA_spliceosomics/"
# Path of the uncompressed junction-count table (.gct).
filenameunzip = os.path.join(
    output_directory,
    "GTEx_Analysis_2017-06-05_v8_STARv2.5.3a_junctions.gct",
)
# Preview only: skip the two lines that precede the column header
# (presumably the GCT version/dimension lines) and read the first 10 records.
df = pd.read_csv(filenameunzip, sep="\t", skiprows=2, nrows=10)
In [6]:
# Display the 10-row preview of the junction table read in the previous cell.
df
Out[6]:
Name | Description | GTEX-1117F-0226-SM-5GZZ7 | GTEX-1117F-0426-SM-5EGHI | GTEX-1117F-0526-SM-5EGHJ | GTEX-1117F-0626-SM-5N9CS | GTEX-1117F-0726-SM-5GIEN | GTEX-1117F-1326-SM-5EGHH | GTEX-1117F-2426-SM-5EGGH | GTEX-1117F-2526-SM-5GZY6 | ... | GTEX-ZZPU-1126-SM-5N9CW | GTEX-ZZPU-1226-SM-5N9CK | GTEX-ZZPU-1326-SM-5GZWS | GTEX-ZZPU-1426-SM-5GZZ6 | GTEX-ZZPU-1826-SM-5E43L | GTEX-ZZPU-2126-SM-5EGIU | GTEX-ZZPU-2226-SM-5EGIV | GTEX-ZZPU-2426-SM-5E44I | GTEX-ZZPU-2626-SM-5E45Y | GTEX-ZZPU-2726-SM-5NQ8O | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr1_12058_12178 | ENSG00000223972.5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | chr1_12228_12612 | ENSG00000223972.5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | chr1_12698_12974 | ENSG00000223972.5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | chr1_12722_13220 | ENSG00000223972.5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | chr1_13053_13220 | ENSG00000223972.5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | chr1_13375_13452 | ENSG00000223972.5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | chr1_14502_15004 | ENSG00000227232.5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | chr1_15039_15795 | ENSG00000227232.5 | 10 | 8 | 8 | 9 | 3 | 8 | 7 | 23 | ... | 1 | 2 | 1 | 0 | 2 | 1 | 0 | 1 | 1 | 0 |
8 | chr1_15948_16606 | ENSG00000227232.5 | 20 | 7 | 14 | 17 | 8 | 18 | 14 | 34 | ... | 0 | 0 | 5 | 1 | 2 | 3 | 0 | 0 | 2 | 2 |
9 | chr1_16766_16857 | ENSG00000227232.5 | 2 | 0 | 2 | 2 | 7 | 0 | 0 | 1 | ... | 10 | 3 | 4 | 1 | 3 | 0 | 7 | 1 | 1 | 1 |
10 rows × 17384 columns
In [23]:
# url = 'https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_exon_reads.parquet'
# output_directory = "/nexus/posix0/MAGE-flaski/service/projects/data/Adam_Antebi/AA_spliceosomics"
# filename = wget.download(url, out=output_directory)
In [20]:
# Exon-level read table stored as Parquet in the shared data directory.
# os.path.join works whether or not `output_directory` ends with a slash;
# plain string concatenation silently builds a wrong path when it does not.
filename = os.path.join(
    output_directory,
    "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_exon_reads.parquet",
)
pf = ParquetFile(filename)
# Take only the first record batch (at most 10 rows) so the preview does not
# require converting the whole table to pandas.
first_ten_rows = next(pf.iter_batches(batch_size=10))
df2 = pa.Table.from_batches([first_ten_rows]).to_pandas()
In [21]:
# Display the first-batch preview (at most 10 rows) of the exon read table.
df2
Out[21]:
Description | GTEX-1117F-0226-SM-5GZZ7 | GTEX-1117F-0426-SM-5EGHI | GTEX-1117F-0526-SM-5EGHJ | GTEX-1117F-0626-SM-5N9CS | GTEX-1117F-0726-SM-5GIEN | GTEX-1117F-1326-SM-5EGHH | GTEX-1117F-2426-SM-5EGGH | GTEX-1117F-2526-SM-5GZY6 | GTEX-1117F-2826-SM-5GZXL | ... | GTEX-ZZPU-1126-SM-5N9CW | GTEX-ZZPU-1226-SM-5N9CK | GTEX-ZZPU-1326-SM-5GZWS | GTEX-ZZPU-1426-SM-5GZZ6 | GTEX-ZZPU-1826-SM-5E43L | GTEX-ZZPU-2126-SM-5EGIU | GTEX-ZZPU-2226-SM-5EGIV | GTEX-ZZPU-2426-SM-5E44I | GTEX-ZZPU-2626-SM-5E45Y | GTEX-ZZPU-2726-SM-5NQ8O | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Name | |||||||||||||||||||||
ENSG00000223972.5_1 | DDX11L1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
ENSG00000223972.5_2 | DDX11L1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.855263 |
ENSG00000223972.5_3 | DDX11L1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
ENSG00000223972.5_4 | DDX11L1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.144737 |
ENSG00000227232.5_1 | WASH7P | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
ENSG00000227232.5_2 | WASH7P | 1.000000 | 18.460526 | 8.394737 | 23.868422 | 0.000000 | 9.078947 | 5.894737 | 5.605263 | 13.447369 | ... | 7.381579 | 12.276316 | 9.644737 | 6.368421 | 8.828947 | 11.592105 | 7.171053 | 19.131578 | 5.960526 | 6.671053 |
ENSG00000227232.5_3 | WASH7P | 1.197368 | 8.460526 | 6.960526 | 11.684211 | 1.539474 | 4.907895 | 1.868421 | 32.184212 | 18.539474 | ... | 5.144737 | 4.697368 | 3.118421 | 4.513158 | 5.868421 | 5.947368 | 4.368421 | 7.763158 | 2.789474 | 5.171053 |
ENSG00000227232.5_4 | WASH7P | 13.710526 | 8.828947 | 9.960526 | 25.052631 | 4.815789 | 6.184211 | 14.947368 | 40.921055 | 21.565788 | ... | 5.421053 | 6.026316 | 12.144737 | 7.697368 | 7.526316 | 8.921053 | 4.842105 | 10.184211 | 3.486842 | 8.236842 |
ENSG00000227232.5_5 | WASH7P | 23.328947 | 4.526316 | 15.078947 | 33.473682 | 17.052631 | 12.500000 | 34.407894 | 48.473682 | 34.250000 | ... | 5.013158 | 19.934212 | 29.223684 | 11.328947 | 11.815789 | 10.894737 | 4.065789 | 13.500000 | 5.355263 | 7.078947 |
ENSG00000227232.5_6 | WASH7P | 20.763159 | 5.723684 | 17.144737 | 23.986841 | 13.105263 | 16.026316 | 22.578947 | 54.618420 | 33.039474 | ... | 12.881579 | 13.855263 | 21.578947 | 15.657895 | 14.118421 | 12.552632 | 5.907895 | 11.092105 | 2.460526 | 11.960526 |
10 rows × 17383 columns
In [29]:
# histo=pd.read_csv(output_directory + "GTEx Portal.csv")
# histo
# df2_columns=df2.columns.str.replace(r'-SM.*', '', regex=True)
# sum(df2_columns.isin(histo['Tissue Sample ID']))
# df2_columns
# not_matching = df2_columns[~df2_columns.isin(histo['Tissue Sample ID'])]
# not_matching
In [30]:
# Sample attribute annotations (tab-separated). The SAMPID column is what the
# expression-table column names are matched against in the next cell.
# os.path.join keeps the path valid even if `output_directory` has no trailing slash.
meta = pd.read_csv(
    os.path.join(output_directory, "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"),
    sep='\t',
)
meta
Out[30]:
SAMPID | SMATSSCR | SMCENTER | SMPTHNTS | SMRIN | SMTS | SMTSD | SMUBRID | SMTSISCH | SMTSPAX | ... | SME1ANTI | SMSPLTRD | SMBSMMRT | SME1SNSE | SME1PCTS | SMRRNART | SME1MPRT | SMNUM5CD | SMDPMPRT | SME2PCTS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GTEX-1117F-0003-SM-58Q7G | NaN | B1 | NaN | NaN | Blood | Whole Blood | 0013756 | 1188.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | GTEX-1117F-0003-SM-5DWSB | NaN | B1 | NaN | NaN | Blood | Whole Blood | 0013756 | 1188.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | GTEX-1117F-0003-SM-6WBT7 | NaN | B1 | NaN | NaN | Blood | Whole Blood | 0013756 | 1188.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | GTEX-1117F-0011-R10a-SM-AHZ7F | NaN | B1, A1 | NaN | NaN | Brain | Brain - Frontal Cortex (BA9) | 0009834 | 1193.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | GTEX-1117F-0011-R10b-SM-CYKQ8 | NaN | B1, A1 | NaN | 7.2 | Brain | Brain - Frontal Cortex (BA9) | 0009834 | 1193.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
22946 | K-562-SM-E9EZC | NaN | NaN | NaN | NaN | Bone Marrow | Cells - Leukemia cell line (CML) | EFO_0002067 | NaN | NaN | ... | 26289400.0 | 27814300.0 | 0.002441 | 26121600.0 | 49.8400 | 0.006370 | 0.995167 | NaN | 0.0 | 50.2621 |
22947 | K-562-SM-E9EZI | NaN | NaN | NaN | NaN | Bone Marrow | Cells - Leukemia cell line (CML) | EFO_0002067 | NaN | NaN | ... | 26653800.0 | 28341700.0 | 0.002336 | 26553400.0 | 49.9056 | 0.006806 | 0.994802 | NaN | 0.0 | 50.2046 |
22948 | K-562-SM-E9EZO | NaN | NaN | NaN | NaN | Bone Marrow | Cells - Leukemia cell line (CML) | EFO_0002067 | NaN | NaN | ... | 14317500.0 | 15168000.0 | 0.001731 | 14163500.0 | 49.7298 | 0.006662 | 0.994935 | NaN | 0.0 | 50.2412 |
22949 | K-562-SM-E9EZT | NaN | NaN | NaN | NaN | Bone Marrow | Cells - Leukemia cell line (CML) | EFO_0002067 | NaN | NaN | ... | 25459900.0 | 26906500.0 | 0.002130 | 25259100.0 | 49.8020 | 0.007145 | 0.994828 | NaN | 0.0 | 50.2529 |
22950 | K-562-SM-E9EZZ | NaN | NaN | NaN | NaN | Bone Marrow | Cells - Leukemia cell line (CML) | EFO_0002067 | NaN | NaN | ... | 22341200.0 | 23740600.0 | 0.001867 | 22232600.0 | 49.8781 | 0.006861 | 0.993576 | NaN | 0.0 | 50.2929 |
22951 rows × 63 columns
In [32]:
# Number of df2 columns whose name is a known SAMPID. Every column except the
# non-sample "Description" column is expected to match.
df2.columns.isin(meta['SAMPID']).sum()
Out[32]:
17382
In [33]:
# Subject-level phenotypes (SUBJID, SEX, AGE, DTHHRDY), tab-separated.
# os.path.join keeps the path valid even if `output_directory` has no trailing slash.
age = pd.read_csv(
    os.path.join(output_directory, "GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"),
    sep='\t',
)
age
Out[33]:
SUBJID | SEX | AGE | DTHHRDY | |
---|---|---|---|---|
0 | GTEX-1117F | 2 | 60-69 | 4.0 |
1 | GTEX-111CU | 1 | 50-59 | 0.0 |
2 | GTEX-111FC | 1 | 60-69 | 1.0 |
3 | GTEX-111VG | 1 | 60-69 | 3.0 |
4 | GTEX-111YS | 1 | 60-69 | 0.0 |
... | ... | ... | ... | ... |
975 | GTEX-ZYY3 | 2 | 60-69 | 4.0 |
976 | GTEX-ZZ64 | 1 | 20-29 | 0.0 |
977 | GTEX-ZZPT | 1 | 50-59 | 4.0 |
978 | GTEX-ZZPU | 2 | 50-59 | 0.0 |
979 | K-562 | 2 | 50-59 | NaN |
980 rows × 4 columns
In [49]:
# Derive the subject ID from the first two dash-separated fields of SAMPID,
# e.g. "GTEX-1117F-0003-SM-58Q7G" -> "GTEX-1117F", so that samples can be
# joined to the subject-level phenotype table.
# The split is done once and the pieces are joined with vectorised string
# concatenation; a row with a missing piece yields NaN instead of raising
# inside a row-wise '-'.join.
sampid_parts = meta['SAMPID'].str.split('-', n=2, expand=True)
meta['g1'] = sampid_parts[0]
meta['g2'] = sampid_parts[1]
meta['SAMP_Group'] = meta['g1'] + '-' + meta['g2']
In [51]:
# Attach the subject-level phenotype columns from `age` to every sample row,
# matching the derived SAMP_Group against SUBJID.
# Because this cell reassigns `meta`, phenotype columns left over from a
# previous run are dropped first; otherwise a re-run would produce suffixed
# duplicates (SEX_x / SEX_y, ...). On a first run the drop is a no-op.
meta = pd.merge(
    meta.drop(columns=age.columns, errors="ignore"),
    age,
    left_on="SAMP_Group",
    right_on="SUBJID",
    how="left",
    # Fail loudly if a subject appears more than once in `age`, which would
    # otherwise silently duplicate sample rows.
    validate="many_to_one",
)
meta
Out[51]:
SAMPID | SMATSSCR | SMCENTER | SMPTHNTS | SMRIN | SMTS | SMTSD | SMUBRID | SMTSISCH | SMTSPAX | ... | SMNUM5CD | SMDPMPRT | SME2PCTS | SAMP_Group | g1 | g2 | SUBJID | SEX | AGE | DTHHRDY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GTEX-1117F-0003-SM-58Q7G | NaN | B1 | NaN | NaN | Blood | Whole Blood | 0013756 | 1188.0 | NaN | ... | NaN | NaN | NaN | GTEX-1117F | GTEX | 1117F | GTEX-1117F | 2 | 60-69 | 4.0 |
1 | GTEX-1117F-0003-SM-5DWSB | NaN | B1 | NaN | NaN | Blood | Whole Blood | 0013756 | 1188.0 | NaN | ... | NaN | NaN | NaN | GTEX-1117F | GTEX | 1117F | GTEX-1117F | 2 | 60-69 | 4.0 |
2 | GTEX-1117F-0003-SM-6WBT7 | NaN | B1 | NaN | NaN | Blood | Whole Blood | 0013756 | 1188.0 | NaN | ... | NaN | NaN | NaN | GTEX-1117F | GTEX | 1117F | GTEX-1117F | 2 | 60-69 | 4.0 |
3 | GTEX-1117F-0011-R10a-SM-AHZ7F | NaN | B1, A1 | NaN | NaN | Brain | Brain - Frontal Cortex (BA9) | 0009834 | 1193.0 | NaN | ... | NaN | NaN | NaN | GTEX-1117F | GTEX | 1117F | GTEX-1117F | 2 | 60-69 | 4.0 |
4 | GTEX-1117F-0011-R10b-SM-CYKQ8 | NaN | B1, A1 | NaN | 7.2 | Brain | Brain - Frontal Cortex (BA9) | 0009834 | 1193.0 | NaN | ... | NaN | NaN | NaN | GTEX-1117F | GTEX | 1117F | GTEX-1117F | 2 | 60-69 | 4.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
22946 | K-562-SM-E9EZC | NaN | NaN | NaN | NaN | Bone Marrow | Cells - Leukemia cell line (CML) | EFO_0002067 | NaN | NaN | ... | NaN | 0.0 | 50.2621 | K-562 | K | 562 | K-562 | 2 | 50-59 | NaN |
22947 | K-562-SM-E9EZI | NaN | NaN | NaN | NaN | Bone Marrow | Cells - Leukemia cell line (CML) | EFO_0002067 | NaN | NaN | ... | NaN | 0.0 | 50.2046 | K-562 | K | 562 | K-562 | 2 | 50-59 | NaN |
22948 | K-562-SM-E9EZO | NaN | NaN | NaN | NaN | Bone Marrow | Cells - Leukemia cell line (CML) | EFO_0002067 | NaN | NaN | ... | NaN | 0.0 | 50.2412 | K-562 | K | 562 | K-562 | 2 | 50-59 | NaN |
22949 | K-562-SM-E9EZT | NaN | NaN | NaN | NaN | Bone Marrow | Cells - Leukemia cell line (CML) | EFO_0002067 | NaN | NaN | ... | NaN | 0.0 | 50.2529 | K-562 | K | 562 | K-562 | 2 | 50-59 | NaN |
22950 | K-562-SM-E9EZZ | NaN | NaN | NaN | NaN | Bone Marrow | Cells - Leukemia cell line (CML) | EFO_0002067 | NaN | NaN | ... | NaN | 0.0 | 50.2929 | K-562 | K | 562 | K-562 | 2 | 50-59 | NaN |
22951 rows × 70 columns
In [ ]: